## Setup
import pandas as pd
import numpy as np
import plotly.express as px
#used to create training and test sets
from sklearn.model_selection import ShuffleSplit
# run logistic regression and SVM and vary some parameters
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn import metrics as mt
from sklearn.metrics import accuracy_score
#use this to standardize the weights of the parameters as large values will give us large results
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
# load the Framingham dataset
# NOTE(review): reads straight from a GitHub raw URL — requires network access
df = pd.read_csv('https://raw.githubusercontent.com/dk28yip/MSDS7331/main/framingham.csv') # read in the csv file
df.head()  # preview the first rows (rendered only as a notebook cell result)
| male | age | education | currentSmoker | cigsPerDay | BPMeds | prevalentStroke | prevalentHyp | diabetes | totChol | sysBP | diaBP | BMI | heartRate | glucose | TenYearCHD | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 39 | 4.0 | 0 | 0.0 | 0.0 | 0 | 0 | 0 | 195.0 | 106.0 | 70.0 | 26.97 | 80.0 | 77.0 | 0 |
| 1 | 0 | 46 | 2.0 | 0 | 0.0 | 0.0 | 0 | 0 | 0 | 250.0 | 121.0 | 81.0 | 28.73 | 95.0 | 76.0 | 0 |
| 2 | 1 | 48 | 1.0 | 1 | 20.0 | 0.0 | 0 | 0 | 0 | 245.0 | 127.5 | 80.0 | 25.34 | 75.0 | 70.0 | 0 |
| 3 | 0 | 61 | 3.0 | 1 | 30.0 | 0.0 | 0 | 1 | 0 | 225.0 | 150.0 | 95.0 | 28.58 | 65.0 | 103.0 | 1 |
| 4 | 0 | 46 | 3.0 | 1 | 23.0 | 0.0 | 0 | 0 | 0 | 285.0 | 130.0 | 84.0 | 23.10 | 85.0 | 85.0 | 0 |
#Look at data
#following code will describe the data
# summary statistics (count/mean/std/quartiles) for every numeric column
df.describe()
| male | age | education | currentSmoker | cigsPerDay | BPMeds | prevalentStroke | prevalentHyp | diabetes | totChol | sysBP | diaBP | BMI | heartRate | glucose | TenYearCHD | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4238.000000 | 4238.000000 | 4133.000000 | 4238.000000 | 4209.000000 | 4185.000000 | 4238.000000 | 4238.000000 | 4238.000000 | 4188.000000 | 4238.000000 | 4238.000000 | 4219.000000 | 4237.000000 | 3850.000000 | 4238.000000 |
| mean | 0.429212 | 49.584946 | 1.978950 | 0.494101 | 9.003089 | 0.029630 | 0.005899 | 0.310524 | 0.025720 | 236.721585 | 132.352407 | 82.893464 | 25.802008 | 75.878924 | 81.966753 | 0.151958 |
| std | 0.495022 | 8.572160 | 1.019791 | 0.500024 | 11.920094 | 0.169584 | 0.076587 | 0.462763 | 0.158316 | 44.590334 | 22.038097 | 11.910850 | 4.080111 | 12.026596 | 23.959998 | 0.359023 |
| min | 0.000000 | 32.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 107.000000 | 83.500000 | 48.000000 | 15.540000 | 44.000000 | 40.000000 | 0.000000 |
| 25% | 0.000000 | 42.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 206.000000 | 117.000000 | 75.000000 | 23.070000 | 68.000000 | 71.000000 | 0.000000 |
| 50% | 0.000000 | 49.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 234.000000 | 128.000000 | 82.000000 | 25.400000 | 75.000000 | 78.000000 | 0.000000 |
| 75% | 1.000000 | 56.000000 | 3.000000 | 1.000000 | 20.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 263.000000 | 144.000000 | 89.875000 | 28.040000 | 83.000000 | 87.000000 | 0.000000 |
| max | 1.000000 | 70.000000 | 4.000000 | 1.000000 | 70.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 696.000000 | 295.000000 | 142.500000 | 56.800000 | 143.000000 | 394.000000 | 1.000000 |
#we will remove current smoker since cigsPerDay are correlated to being a smoker
df = df.drop(columns=['currentSmoker'])
print(df.dtypes)
# df.info() prints its report itself and returns None, so wrapping it in
# print() (as before) appended a spurious "None" line to the output
df.info()
male int64 age int64 education float64 cigsPerDay float64 BPMeds float64 prevalentStroke int64 prevalentHyp int64 diabetes int64 totChol float64 sysBP float64 diaBP float64 BMI float64 heartRate float64 glucose float64 TenYearCHD int64 dtype: object <class 'pandas.core.frame.DataFrame'> RangeIndex: 4238 entries, 0 to 4237 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 male 4238 non-null int64 1 age 4238 non-null int64 2 education 4133 non-null float64 3 cigsPerDay 4209 non-null float64 4 BPMeds 4185 non-null float64 5 prevalentStroke 4238 non-null int64 6 prevalentHyp 4238 non-null int64 7 diabetes 4238 non-null int64 8 totChol 4188 non-null float64 9 sysBP 4238 non-null float64 10 diaBP 4238 non-null float64 11 BMI 4219 non-null float64 12 heartRate 4237 non-null float64 13 glucose 3850 non-null float64 14 TenYearCHD 4238 non-null int64 dtypes: float64(9), int64(6) memory usage: 496.8 KB None
A total of 4,238 patient records. The following columns of continuous variables will be used.
#check for NA
# count missing values per column (glucose has the most, per the output below)
df.isnull().sum()
male 0 age 0 education 105 cigsPerDay 29 BPMeds 53 prevalentStroke 0 prevalentHyp 0 diabetes 0 totChol 50 sysBP 0 diaBP 0 BMI 19 heartRate 1 glucose 388 TenYearCHD 0 dtype: int64
# Any missing values in the dataset
def plot_missingness(df: pd.DataFrame = df) -> None:
    """Bar-plot per-column missingness vs. completeness using plotly express.

    NOTE(review): the default argument binds the *global* df at definition
    time; callers should pass their frame explicitly (as the call below does).
    """
    n_rows = df.shape[0]
    nan_count = df.isna().sum().astype(int)
    nan_pct = round(nan_count / n_rows * 100, 1)
    # missingness rows, sorted ascending by percentage
    miss = pd.DataFrame({
        'Feature': nan_count.index,
        'NaN_Count': nan_count.values,
        'Missing %': nan_pct.values,
        'Type': 'Missingness',
    }).sort_values('Missing %')
    # completeness mirrors missingness; the original cast these percentages
    # to int, so we preserve that.  (The old version built these rows one at
    # a time while concatenating onto the frame it was indexing into, relying
    # on duplicate index labels — this vectorized form is equivalent.)
    complete = pd.DataFrame({
        'Feature': nan_count.index,
        'NaN_Count': (n_rows - nan_count).values,
        'Missing %': (100 - nan_pct).astype(int).values,
        'Type': 'Completeness',
    })
    plot_df = pd.concat([miss, complete], sort=True)
    # Missingness Plot
    fig = px.bar(plot_df,
                 x='Feature',
                 y='Missing %',
                 title=f"Missingness Plot (N={n_rows})",
                 color='Type',
                 opacity=0.6,
                 color_discrete_sequence=['red', '#808080'],
                 width=800,
                 height=400)
    fig.show()
plot_missingness(df)
Since our dataset has 4,238 records and the largest missing count is 388 in glucose (less than 10%), we will remove the rows with NAs from the data set.
#drop NAs and check to ensure there are no more NAs
# drops every row containing at least one missing value
df=df.dropna()
# re-count NAs per column; all should be zero now
df.isnull().sum()
male 0 age 0 education 0 cigsPerDay 0 BPMeds 0 prevalentStroke 0 prevalentHyp 0 diabetes 0 totChol 0 sysBP 0 diaBP 0 BMI 0 heartRate 0 glucose 0 TenYearCHD 0 dtype: int64
For training and testing purposes, let's gather the data we have and grab 80% of the instances for training and the remaining 20% for testing. Moreover, let's repeat this process of separating the testing and training data three times. We will use the hold-out cross-validation method built into scikit-learn.
# Split the frame into a label vector y and a feature matrix X.
if 'TenYearCHD' in df:
    # pop() removes the class-label column and returns it in a single step
    y = df.pop('TenYearCHD').values
X = df.values  # every remaining column becomes a predictor
# Calling .values converts the pandas objects into plain numpy arrays,
# which is what scikit-learn expects.
# The ShuffleSplit object below produces repeated random train/test
# partitions of the data: 3 splits, each holding out 20% for testing.
num_cv_iterations = 3
num_instances = len(y)
cv_object = ShuffleSplit(n_splits=num_cv_iterations, test_size=0.2)
print(cv_object)
ShuffleSplit(n_splits=3, random_state=None, test_size=0.2, train_size=None)
# Setup and create reusable learning parameters for the logistic regression object
# C=0.05 (fairly strong L2 regularization) because we will standardize the data
lr_clf = LogisticRegression(penalty='l2', C=0.05, class_weight=None, solver='liblinear' ) # get object
# Code to run logistic Regression
%matplotlib inline
plt.style.use('ggplot')
#declare iteration number value
iter_num=0
np.random.seed(0)
for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(X,y)):
scl_obj = StandardScaler()
scl_obj.fit(X) # find scalings for each column that make this zero mean and unit std
X_scaled = scl_obj.transform(X) # apply to training
lr_clf.fit(X_scaled[train_indices],y[train_indices]) # train object
y_hat = lr_clf.predict(X_scaled[test_indices]) # get test set precitions
# sort these attributes and spit them out
zip_vars = zip(lr_clf.coef_.T,df) # combine attributes
zip_vars = sorted(zip_vars)
# print the accuracy and confusion matrix
print("====Iteration",iter_num," ====")
print("accuracy", mt.accuracy_score(y[test_indices],y_hat))
print("confusion matrix\n",mt.confusion_matrix(y[test_indices],y_hat))
====Iteration 0 ==== accuracy 0.8524590163934426 confusion matrix [[617 6] [102 7]] ====Iteration 1 ==== accuracy 0.8387978142076503 confusion matrix [[607 7] [111 7]] ====Iteration 2 ==== accuracy 0.8510928961748634 confusion matrix [[616 0] [109 7]]
We complete three iterations of the training and test sets, and our accuracy in determining a patient's chance of getting heart disease in the next 10 years ranges between 83.8% and 85.2%.
# Same code to run logistic Regression but for the weights of the logistic regression
%matplotlib inline
plt.style.use('ggplot')
#declare iteration number value
iter_num=0
np.random.seed(0)
for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(X,y)):
scl_obj = StandardScaler()
scl_obj.fit(X) # find scalings for each column that make this zero mean and unit std
X_scaled = scl_obj.transform(X) # apply to training
lr_clf.fit(X_scaled[train_indices],y[train_indices]) # train object
y_hat = lr_clf.predict(X_scaled[test_indices]) # get test set precitions
# sort these attributes and spit them out
zip_vars = zip(lr_clf.coef_.T,df) # combine attributes
zip_vars = sorted(zip_vars)
# print the accuracy and confusion matrix
print("====Iteration",iter_num," ====")
weights = pd.Series(lr_clf.coef_[0],index=df.columns)
weights.plot(kind='bar')
plt.show()
for coef, name in zip_vars:
print(name, 'has weight of', coef[0]) # now print them out
====Iteration 0 ====
education has weight of -0.06289061378285161 diaBP has weight of -0.024698183162393833 heartRate has weight of -0.004383913052116499 diabetes has weight of 0.025219192488851124 BMI has weight of 0.030032036904813514 BPMeds has weight of 0.03527524566279782 prevalentStroke has weight of 0.07488213198913618 prevalentHyp has weight of 0.07753066833162651 totChol has weight of 0.11823216531101302 glucose has weight of 0.13635491121400395 cigsPerDay has weight of 0.19339452061296883 male has weight of 0.2514317491277591 sysBP has weight of 0.2972920992539578 age has weight of 0.4481017618333452 ====Iteration 1 ====
diaBP has weight of -0.11170914094141934 heartRate has weight of -0.06543602353720188 education has weight of -0.026541145762055438 BPMeds has weight of 0.0007766354246441511 BMI has weight of 0.029089275959719357 diabetes has weight of 0.029795101508091042 prevalentStroke has weight of 0.07554196384060148 totChol has weight of 0.09325992514206269 prevalentHyp has weight of 0.12508381459245213 glucose has weight of 0.14760124459469828 cigsPerDay has weight of 0.16799095162717773 male has weight of 0.28153641014915864 sysBP has weight of 0.37646384451938336 age has weight of 0.44891595263409967 ====Iteration 2 ====
heartRate has weight of -0.04684585822327157 education has weight of -0.044915200580612796 diaBP has weight of -0.01932217679107812 BMI has weight of 0.008900992027365613 diabetes has weight of 0.015462282451624253 prevalentStroke has weight of 0.04944140850361994 BPMeds has weight of 0.051082542938597986 prevalentHyp has weight of 0.079445187630391 totChol has weight of 0.10411104489614763 glucose has weight of 0.1181490621677388 cigsPerDay has weight of 0.18297858198604106 male has weight of 0.2567047385522673 sysBP has weight of 0.2927028467849695 age has weight of 0.43889298684282385
Based on the 3 iterations we ran, we determine that there are 5 key variables that result in a higher probability of being diagnosed with heart disease. The 5 factors are listed from highest to lowest weight. A variable's weight is its contribution to the log-odds (the log of the probability of the event divided by the probability of no event) of heart disease.
- Age (weight 0.439 to 0.449) - The older the individual, the higher the likelihood of developing heart disease. This makes sense, as bodily functions deteriorate with age, leaving less resistance to disease.
- Systolic pressure (weight 0.293 to 0.376) - Higher pressure required to pump blood through the body overworks the heart. If this pressure is sustained over time, the odds of developing heart disease increase.
- Gender being male (weight 0.251 to 0.282) - Males have higher odds of developing heart disease.
- Cigarettes per day (weight 0.168 to 0.193) - Smoking more cigarettes per day increases the odds of developing heart disease.
- Glucose levels (weight 0.118 to 0.148) - Individuals with higher glucose levels have higher odds of developing heart disease.
# Keep the LAST ShuffleSplit fold as the hold-out split for the SVM experiments.
for train_indices, test_indices in cv_object.split(X,y):
    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
# Fit a fresh scaler on the retained training rows only.  The original code
# reused scl_obj left over from the previous cell, which had been fitted on
# ALL of X — a test-set leakage bug.
scl_obj = StandardScaler()
scl_obj.fit(X_train)
X_train_scaled = scl_obj.transform(X_train) # apply to training
X_test_scaled = scl_obj.transform(X_test)
# check how accuracy of the linear SVM changes with the regularization parameter C
accuracy, params = [], []
for c in np.arange(1, 10):
    C_val = 0.1 * c  # sweep C over 0.1 .. 0.9
    svm_clf_linear = SVC(kernel='linear', C=C_val)
    svm_clf_linear.fit(X_train_scaled, y_train)
    y_hat = svm_clf_linear.predict(X_test_scaled)
    accuracy.append(accuracy_score(y_test, y_hat))
    params.append(C_val)
accuracy = np.array(accuracy)
# label the line so plt.legend() has an artist to show (the original emitted a
# "No artists with labels found" warning)
plt.plot(params, accuracy, label='linear SVM')
plt.ylabel('accuracy of prediction')
plt.xlabel('C')
plt.legend(loc='upper left')
#plt.xscale('log')
plt.show()
No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
Based on the sensitivity analysis of C for the linear SVM model, we note that accuracy is not sensitive to C, so we will use C=1 as the C parameter in our linear SVM model.
# Final linear SVM at C=1.  Train/score on the SCALED features for consistency
# with the C sensitivity sweep above — the original fitted on the raw
# (unscaled) X_train, which is a different model than the one that was tuned.
svm_clf_linear = SVC(kernel='linear', C=1)
svm_clf_linear.fit(X_train_scaled, y_train)
y_hat = svm_clf_linear.predict(X_test_scaled)
acc = mt.accuracy_score(y_test, y_hat)
conf = mt.confusion_matrix(y_test, y_hat)
print('Prediction Accuracy:', acc)  # also fixes the "Predication" typo
print(conf)
Predication Accuracy: 0.8456284153005464 [[619 0] [113 0]]
#print the support vector shape for the linear SVM
# shape is (number of support vectors, number of features)
print('Support Vectors Shape:',svm_clf_linear.support_vectors_.shape)
Support Vectors Shape: (899, 14)
# sensitivity of the RBF-kernel SVM's accuracy to C (sweep 0.1 .. 1.0)
accuracy, params = [], []
for c in np.arange(1, 11):
    C_val = 0.1 * c
    svm_clf_nonlinear = SVC(C=C_val, kernel='rbf', gamma='auto')
    svm_clf_nonlinear.fit(X_train_scaled, y_train)
    y_hat = svm_clf_nonlinear.predict(X_test_scaled)
    accuracy.append(accuracy_score(y_test, y_hat))
    params.append(C_val)
accuracy = np.array(accuracy)
# label the line so plt.legend() has an artist to show (silences the warning)
plt.plot(params, accuracy, label='rbf SVM')
plt.ylabel('accuracy of prediction')
plt.xlabel('C')
plt.legend(loc='upper left')
plt.show()
No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
Based on the sensitivity analysis of C for the nonlinear SVM on scaled data, we note that C=0.6 performs best, so we will use C=0.6 in our nonlinear scaled SVM model.
# Train the RBF-kernel SVM with the C chosen by the sensitivity analysis (C=0.6).
svm_clf_nonlinear = SVC(C=0.6, kernel='rbf', degree=3, gamma='auto')
svm_clf_nonlinear.fit(X_train_scaled, y_train)
# Score the held-out test split.
y_hat = svm_clf_nonlinear.predict(X_test_scaled)
acc = mt.accuracy_score(y_test, y_hat)
conf = mt.confusion_matrix(y_test, y_hat)
print('Non-Linear SVM accuracy:', acc)
print(conf)
Non-Linear SVM accuracy: 0.8456284153005464 [[619 0] [113 0]]
# look at the support vectors for the nonlinear SVM
# (n_support_vectors, n_features)
print('Support Vectors Shape:', svm_clf_nonlinear.support_vectors_.shape)
# indices of the support vectors within the training set
print(svm_clf_nonlinear.support_.shape)
# number of support vectors per class (class 0, class 1)
print(svm_clf_nonlinear.n_support_ )
Support Vectors Shape: (1229, 14) (1229,) [785 444]
# Now let's do some different analysis with the SVM and look at the instances that were chosen as support vectors
# (these are usually instances that are hard to classify)
# make a dataframe of the training data
df_tested_on = df.iloc[train_indices].copy() # saved from above, the indices chosen for training
# now get the support vectors from the trained model
# (.copy() keeps the assignment below off a view, avoiding SettingWithCopyWarning)
df_support = df_tested_on.iloc[svm_clf_nonlinear.support_,:].copy()
# support_ holds positions WITHIN the training split, so index the training
# labels — the original indexed the full y array, pairing wrong labels with
# the support-vector rows.
df_support['TenYearCHD'] = y[train_indices][svm_clf_nonlinear.support_]
df['TenYearCHD'] = y # also add the label back into the original data
df_support.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1229 entries, 3547 to 2137 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 male 1229 non-null int64 1 age 1229 non-null int64 2 education 1229 non-null float64 3 cigsPerDay 1229 non-null float64 4 BPMeds 1229 non-null float64 5 prevalentStroke 1229 non-null int64 6 prevalentHyp 1229 non-null int64 7 diabetes 1229 non-null int64 8 totChol 1229 non-null float64 9 sysBP 1229 non-null float64 10 diaBP 1229 non-null float64 11 BMI 1229 non-null float64 12 heartRate 1229 non-null float64 13 glucose 1229 non-null float64 14 TenYearCHD 1229 non-null int64 dtypes: float64(9), int64(6) memory usage: 153.6 KB
C:\Datascience\Anaconda3\envs\ML7331\lib\site-packages\ipykernel_launcher.py:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# now let's see the statistics of these attributes
# (the unused `from pandas.plotting import boxplot` import was removed)
# group the original data and the support vectors by the class label
df_grouped_support = df_support.groupby(['TenYearCHD'])
df_grouped = df.groupby(['TenYearCHD'])
# plot KDEs of selected variables; groupby iterates keys in sorted order,
# so group 0 (no CHD = low risk) is drawn first, then group 1 (at risk).
# The original legend listed 'At Risk' first, mislabeling the curves.
vars_to_plot = ['male','age','sysBP','cigsPerDay']
for v in vars_to_plot:
    plt.figure(figsize=(10,4))
    # left panel: support-vector instances only
    plt.subplot(1,2,1)
    ax = df_grouped_support[v].plot.kde()
    plt.legend(['Low Risk','At Risk'])
    plt.title(v+' (Instances chosen as Support Vectors)')
    # right panel: full original distributions
    plt.subplot(1,2,2)
    ax = df_grouped[v].plot.kde()
    plt.legend(['Low Risk','At Risk'])
    plt.title(v+' (Original)')
Both the linear and nonlinear SVMs yielded the same accuracy, 84.56%. Based on the graphs, the support vectors did not yield much insight. In addition, the number of support vectors in the linear and nonlinear models, (899, 14) and (1229, 14) respectively, makes them hard to interpret. However, given that the linear SVM uses fewer support vectors to achieve the same classification accuracy of 84.56%, we conclude that the data can be separated in a linear fashion.